Preamble

This document shows how to combine 3 data sets:

These three data sets provide information by district (roughly 700 in Vietnam). The difficulties come from the fact that these 3 datasets listed above do not have exactly the same districts definitions. Indeed, as the population grows with time, districts tend to split. The problems we had to deal with here are

Packages

library(sf)
library(stars)
library(osmdata)
library(magrittr)
library(stringr)
library(lubridate)
library(tidyr)
library(purrr)
library(dplyr) # best to load last
# A thin wrapper around hist() that suppresses the main title.
hist2 <- function(...) {
  hist(..., main = NA)
}

Population density raster data

Downloading the population density data from WorldPop:

download.file("ftp://ftp.worldpop.org.uk/GIS/Population/Individual_countries/VNM/Viet_Nam_100m_Population/VNM_ppp_v2b_2020_UNadj.tif",
              "VNM_ppp_v2b_2020_UNadj.tif")

Loading the data:

worldpop <- read_stars("VNM_ppp_v2b_2020_UNadj.tif")

Google and Apple mobility data

Downloading the population mobility data:

download.file("https://www.dropbox.com/s/6fl62gcuma9890f/google.rds?raw=1", "google.rds")
download.file("https://www.dropbox.com/s/uuxxjm3cgs0a4gw/apple.rds?raw=1", "apple.rds")

Loading the data:

google <- readRDS("google.rds")
# Apple mobility values are expressed relative to a baseline of 100:
# re-center them on 0. across(where(...)) supersedes the retired mutate_if().
apple <- readRDS("apple.rds") %>% 
  mutate(across(where(is.numeric), ~ .x - 100))

Polygons from GADM

Downloading the polygons from GADM:

download.file("https://biogeo.ucdavis.edu/data/gadm3.6/Rsf/gadm36_VNM_0_sf.rds", "gadm36_VNM_0_sf.rds")
download.file("https://biogeo.ucdavis.edu/data/gadm3.6/Rsf/gadm36_VNM_1_sf.rds", "gadm36_VNM_1_sf.rds")
download.file("https://biogeo.ucdavis.edu/data/gadm3.6/Rsf/gadm36_VNM_2_sf.rds", "gadm36_VNM_2_sf.rds")
download.file("https://biogeo.ucdavis.edu/data/gadm2.8/rds/VNM_adm2.rds"       , "VNM_adm2.rds")
download.file("https://biogeo.ucdavis.edu/data/gadm2.8/rds/VNM_adm3.rds"       , "VNM_adm3.rds")

Loading the polygons:

vn0 <- readRDS("gadm36_VNM_0_sf.rds")     # country polygon
vn1 <- readRDS("gadm36_VNM_1_sf.rds")     # provinces polygons

vn2 <- readRDS("gadm36_VNM_2_sf.rds") %>% # districts polygons
  transmute(province = str_squish(NAME_1),
            district = str_squish(NAME_2) %>%
              str_remove_all("Thành Phố |Thị Xã |Quận "))

vn2_old <- readRDS("VNM_adm2.rds") %>%    # old version of the districts polygons
  st_as_sf() %>% 
  transmute(province = str_squish(NAME_1),
            district = str_squish(NAME_2) %>% 
              str_remove_all("Thành Phố |Thị Xã |Quận ") %>% 
              str_replace("Chiêm Hoá", "Chiêm Hóa"))

vn3_old <- readRDS("VNM_adm3.rds") %>%    # old version of the communes polygons
  st_as_sf() %>% 
  transmute(province = str_squish(NAME_1),
            district = str_squish(NAME_2) %>% 
              str_remove_all("Thành Phố |Thị Xã |Quận ") %>% 
              str_replace("Chiêm Hoá", "Chiêm Hóa"),
            commune = str_squish(NAME_3)) %>% 
  arrange(province, district, commune)

Removing the commune Huæi Luông from the district Sìn Hồ:

vn2_old %<>% 
  filter(district == "Sìn Hồ") %>% 
  st_set_geometry(st_union(filter(vn3_old, district == "Sìn Hồ", commune != "Huæi Luông"))) %>% 
  rbind(filter(vn2_old, district != "Sìn Hồ")) %>% 
  arrange(province, district)

Adding the polygon for Côn Đảo from OpenStreetMap

Downloading the administrative boundaries from OpenStreetMap:

con_dao <- c(106.523384, 8.622214, 106.748218, 8.782639) %>%
  opq() %>% 
  add_osm_feature(key = "boundary", value = "administrative") %>% 
  osmdata_sf()

The main island is made of lines. Let’s transform them into a polygon:

main_island <- con_dao$osm_lines %>%
  st_combine() %>%
  st_polygonize() %>%
  st_geometry() %>%
  st_collection_extract("POLYGON")

The other islands are already polygons. Let’s extract and merge them with the newly created polygon of the main island:

con_dao <- con_dao$osm_polygons %>%
  st_geometry() %>% 
  c(main_island) %>% 
  st_multipolygon() %>% 
  st_sfc(crs = st_crs(vn2))

Here are the polygons for Côn Đảo:

con_dao %>% 
  st_geometry() %>% 
  plot(col = "grey")

Let’s add the polygon of Côn Đảo to the GADM data:

vn2 %<>%
  filter(province == "Bà Rịa - Vũng Tàu") %>% 
  head(1) %>% 
  mutate(district = "Côn Đảo") %>% 
  st_set_geometry(con_dao) %>% 
  rbind(vn2)

Adding the census 2019 data

For some reason the province of Vĩnh Long is missing… We add information from Wikipedia:

census <- census_path %>%
  readRDS() %>% 
  group_by(province, district) %>% 
  summarise(n = sum(n)) %>% 
  ungroup() %>% 
  mutate(province = str_squish(province) %>%
                    str_remove_all("Thành phố |Tỉnh ") %>% 
                    str_replace("Đăk Lăk"             , "Đắk Lắk") %>% 
                    str_replace("Đăk Nông"            , "Đắk Nông") %>% 
                    str_replace("Khánh Hoà"           , "Khánh Hòa") %>% 
                    str_replace("Thanh Hoá"           , "Thanh Hóa"),
         district = str_squish(district) %>%
                    str_replace("Thành phố Cao Lãnh"  , "Cao Lãnh (Thành phố)") %>% 
                    str_replace("Thị xã Hồng Ngự"     , "Hồng Ngự (Thị xã)") %>% 
                    str_replace("Thị xã Kỳ Anh"       , "Kỳ Anh (Thị xã)") %>% 
                    str_replace("Thị xã Long Mỹ"      , "Long Mỹ (Thị xã)") %>% 
                    str_replace("Thị xã Cai Lậy"      , "Cai Lậy (Thị xã)") %>% 
                    str_replace("Thị xã Duyên Hải"    , "Duyên Hải (Thị xã)") %>% 
                    str_remove_all("Huyện |Huỵên |Quận |Thành phố |Thành Phô |Thành Phố |Thị xã |Thị Xã ") %>% 
                    str_replace("Hoà Vang"            , "Hòa Vang") %>% 
                    str_replace("Ứng Hoà"             , "Ứng Hòa") %>% 
                    str_replace("Đồng Phù"            , "Đồng Phú") %>% 
                    str_replace("Đắk Glong"           , "Đăk Glong") %>% 
                    str_replace("Ia H’Drai"           , "Ia H' Drai") %>% 
                    str_replace("ý Yên"               , "Ý Yên") %>% 
                    str_replace("Bác ái"              , "Bác Ái") %>% 
                    str_replace("Phan Rang- Tháp Chàm", "Phan Rang-Tháp Chàm") %>% 
                    str_replace("Đông Hoà"            , "Đông Hòa") %>% 
                    str_replace("Tuy Hòa"             , "Tuy Hoà") %>% 
                    str_replace("Thiệu Hoá"           , "Thiệu Hóa")) %>% 
  bind_rows(
    bind_cols(
      data.frame(province = rep("Vĩnh Long", 8)),
      tribble(
        ~district  ,     ~n,
        "Bình Tân" ,  93758, # 2009
        "Long Hồ"  , 160537, # 2018
        "Mang Thít", 103573, # 2018
        "Tam Bình" , 162191, # 2003
        "Trà Ôn"   , 149983, # 2003
        "Vũng Liêm", 176233, # 2003
        "Bình Minh",  95282, # 2003
        "Vĩnh Long", 200120  # 2018
      )
    ),
    tribble(
      ~province  , ~district     , ~n, 
      "Điện Biên", "Mường Ảng"   ,  37077, # 2006
      "Hải Phòng", "Bạch Long Vĩ",    912, # 2018
      "Phú Thọ"  , "Thanh Sơn"   , 187700, # 2003
      "Quảng Trị", "Cồn Cỏ"      ,    400  # 2003
    )
  )
identical(sort(unique(census$province)), sort(unique(vn2$province)))
## [1] TRUE
anti_join(census, vn2, c("province", "district"))
## # A tibble: 0 x 3
## # … with 3 variables: province <chr>, district <chr>, n <dbl>
anti_join(vn2, census, c("province", "district"))
## Simple feature collection with 0 features and 2 fields
## bbox:           xmin: NA ymin: NA xmax: NA ymax: NA
## CRS:            EPSG:4326
## [1] province district geometry
## <0 rows> (or 0-length row.names)
vn2 %<>% left_join(census, c("province", "district"))
vn2 %>% 
  st_drop_geometry() %>% 
  group_by(province, district) %>% 
  tally() %>% 
  filter(n > 1)
## # A tibble: 0 x 3
## # Groups:   province [0]
## # … with 3 variables: province <chr>, district <chr>, n <int>

Merging some districts

The colocation data use the Bing polygons, which do not seem to be very much up to date. In order to adjust for that, we need to merge the following districts in the GADM data:

Furthermore, some of the districts need to be split in two, with each part merged to a different district. That’s the case for these 3 districts:

As illustrated below:

plot_districts <- function(d1, d2, d3, vn = vn2, vn_old = vn2_old, rstr = worldpop) {
  # Visual check of a district split: d1 is the new district (red) that has to
  # be split back between the two old districts d2 and d3 (blue outlines),
  # with the population raster rstr overlaid to show where people live.
  # vn / vn_old / rstr default to the globals used throughout the document but
  # are now explicit parameters, consistent with proportion() and
  # merge_back_districts().
  tmp <- vn %>% 
    filter(district %in% c(d1, d2, d3)) %>% 
    st_geometry()
  
  plot(tmp)
  plot(rstr, add = TRUE, main = NA)
  plot(tmp, add = TRUE, col = adjustcolor("green", .2))
  
  # the new district to be split, highlighted in red:
  vn %>% 
    filter(district == d1) %>% 
    st_geometry() %>% 
    plot(add = TRUE, col = adjustcolor("red", .2))
  
  # the two old (pre-split) districts, outlined in blue:
  vn_old %>% 
    filter(district %in% c(d2, d3)) %>% 
    st_geometry() %>% 
    plot(add = TRUE, border = "blue", lwd = 2)
}
plot_districts("Nậm Pồ"  , "Mường Chà", "Mường Nhé")

plot_districts("Lâm Bình", "Nà Hang"  , "Chiêm Hóa")

plot_districts("Nậm Nhùn", "Mường Tè" , "Sìn Hồ")

The raster data shows the population density from WorldPop that we’ll use to split the population of the red district into the 2 blue districts. Here are two functions to fix these 2 above-mentioned problems. Here is the first function:

merge_districts <- function(vn, d1, d2, p) {
  # Merge district d2 into district d1 within province p: the census
  # populations are summed and the polygons are unioned; the merged district
  # keeps the name d1.
  both <- c(d1, d2)
  
  selected <- vn %>% 
    filter(district %in% both, province == p) %>% 
    mutate(n = sum(n))
  
  merged <- selected %>% 
    filter(district %in% d1) %>% 
    st_set_geometry(st_union(selected))
  
  vn %>% 
    filter(! (district %in% both & province == p)) %>% 
    rbind(merged) %>% 
    arrange(province, district)
}

The second function needs this function:

proportion <- function(to_cut, one_district, new_vn = vn2, old_vn = vn2_old, rstr = worldpop) {
  # Proportion of the WorldPop population of the (new) district `to_cut` that
  # falls inside the (old) district `one_district`, estimated by cropping the
  # population raster and summing the pixel values.
  
  # total raster population inside a polygon:
  raster_pop <- function(zone) {
    rstr %>% 
      st_crop(zone) %>% 
      st_as_stars() %>% 
      unlist() %>% 
      sum(na.rm = TRUE)
  }
  
  whole <- filter(new_vn, district == to_cut)
  part  <- st_intersection(whole, filter(old_vn, district == one_district))
  
  raster_pop(part) / raster_pop(whole)
}

Let’s try it:

proportion("Nậm Nhùn", "Mường Tè" , vn2, vn2_old, worldpop)
## [1] 0.6717576
proportion("Nậm Pồ"  , "Mường Chà", vn2, vn2_old, worldpop)
## [1] 0.4612189
proportion("Lâm Bình", "Nà Hang"  , vn2, vn2_old, worldpop)
## [1] 0.7201902

And here is the second function we needed:

merge_back_districts <- function(c2, d1, d2, d3, c1 = vn2_old, rstr = worldpop) {
  # Splits the new district d1 (present in c2 but not in the old polygons c1)
  # back into the two old districts d2 and d3: d1's census population is
  # divided between d2 and d3 proportionally to the WorldPop raster, and the
  # three new polygons are replaced by the two old ones.
  dsts <- c(d1, d2, d3)
  
  # named vector of the census populations of the 3 districts:
  tmp <- c2 %>% 
    filter(district %in% dsts) %$%
    setNames(n, district)

  # share of d1's population that lies inside the old d2 polygon:
  half1 <- round(proportion(d1, d2, c2, c1, rstr) * tmp[d1])

  half2 <- tmp[d1] - half1
  tmp[d2] <- tmp[d2] + half1
  tmp[d3] <- tmp[d3] + half2
  tmp <- tmp[dsts[-1]]
  
  c1 %>% 
    filter(district %in% dsts[-1]) %>% 
    # BUG FIX: assign the populations by district name, not by row position.
    # The filtered rows of c1 are in alphabetical order (c1 is arranged by
    # province, district), which does not necessarily match the (d2, d3)
    # order of tmp: e.g. "Chiêm Hóa" sorts before "Nà Hang", so a positional
    # assignment would swap the two populations.
    mutate(n = unname(tmp[district])) %>% 
    select(everything(), geometry) %>% 
    rbind(filter(c2, ! district %in% dsts)) %>% 
    arrange(province, district)
}

Let’s now call these 2 functions to do the mergings:

vn2 %<>%
  merge_districts("Kỳ Anh"     , "Kỳ Anh (Thị xã)"   , "Hà Tĩnh") %>% 
  merge_districts("Long Mỹ"    , "Long Mỹ (Thị xã)"  , "Hậu Giang") %>% 
  merge_districts("Cai Lậy"    , "Cai Lậy (Thị xã)"  , "Tiền Giang") %>% 
  merge_districts("Duyên Hải"  , "Duyên Hải (Thị xã)", "Trà Vinh") %>% 
  merge_districts("Tân Uyên"   , "Bắc Tân Uyên"      , "Bình Dương") %>%
  merge_districts("Bến Cát"    , "Bàu Bàng"          , "Bình Dương") %>% 
  merge_districts("Bắc Từ Liêm", "Nam Từ Liêm"       , "Hà Nội") %>% # then rename to Từ Liêm
  merge_districts("Mộc Hóa"    , "Kiến Tường"        , "Long An") %>% 
  merge_districts("Quỳnh Lưu"  , "Hoàng Mai"         , "Nghệ An") %>% 
  merge_districts("Quảng Trạch", "Ba Đồn"            , "Quảng Bình") %>% 
  merge_back_districts("Nậm Pồ"  , "Mường Chà", "Mường Nhé") %>% 
  merge_back_districts("Nậm Nhùn", "Mường Tè" , "Sìn Hồ") %>% 
  merge_back_districts("Lâm Bình", "Nà Hang"  , "Chiêm Hóa") %>% 
  mutate(district = str_replace(district, "Bắc Từ Liêm", "Từ Liêm")) # here the renaming

Let’s calculate the areas:

vn2 %<>% 
  # Compute each district's area in km2 directly from the geometry column:
  # referencing the global vn2 inside its own pipeline (as before) is fragile
  # if any earlier step ever reorders or filters rows.
  mutate(area_km2 = st_area(geometry) %>%
           as.numeric() %>% 
           divide_by(1e6)) %>% 
  select(everything(), geometry)

Visualizations

Population sizes

hist2(vn2$n, n = 50, xlab = "population size", ylab = "number of districts", axes = FALSE)
axis(1, seq(0, 1e6, 2e5), format(seq(0, 10, 2) * 1e5, big.mark = ",", scientific = FALSE, trim = TRUE))
axis(2)

cb <- RColorBrewer::brewer.pal(9, "YlOrBr")
color_generator <- colorRampPalette(cb)
pal <- color_generator(10)
hist2(vn2$n, quantile(vn2$n, seq(0, 1, le = 11)), col = pal, axes = FALSE,
      xlab = "population size", ylab = "density of probability")
axis(1, seq(0, 1e6, 2e5), format(seq(0, 10, 2) * 1e5, big.mark = ",", scientific = FALSE, trim = TRUE))
axis(2)

vn2 %>% 
  st_geometry() %>% 
  plot(lwd = .1, col = pal[cut(vn2$n, quantile(vn2$n, seq(0, 1, le = 11)))], main = NA)

vn0 %>% 
  st_geometry() %>% 
  plot(add = TRUE)

mean(vn2$n)
## [1] 139978.6
median(vn2$n)
## [1] 122217

Areas

hist2(vn2$area_km2, n = 50,
      xlab = expression(paste("area ", (km^2))), ylab = "number of districts")

mean(vn2$area_km2)
## [1] 471.8696
median(vn2$area_km2)
## [1] 339.3427

Densities

(quants <- quantile(vn2$n / vn2$area_km2, c(.025, .25, .5, .75, .975)))
##        2.5%         25%         50%         75%       97.5% 
##    34.62209   115.50101   397.09968  1019.86255 20341.46013
hist2(log10(vn2$n / vn2$area_km2), n = 50, axes = FALSE,
      xlab = expression(paste("density (/", km^2, ")")), ylab = "number of districts")
axis(1, 1:4, c("10", "100", "1000", "10000"))
axis(2)
abline(v = log10(quants), lty = c(3, 2, 1, 2, 3), col = "blue", lwd = 2)

xs <- log10(vn2$n / vn2$area_km2)
hist2(xs, quantile(xs, seq(0, 1, le = 11)), col = pal, axes = FALSE,
      xlab = expression(paste("density (/", km^2, ")")), ylab = "density of probability")
axis(1, 1:4, c("10", "100", "1000", "10000"))
axis(2)

# BUG FIX: this section maps population *density*, but the original code
# colored the polygons by population size (vn2$n) — the transmute(dens = ...)
# result was silently discarded by st_geometry(). Color by density instead:
dens <- vn2$n / vn2$area_km2
vn2 %>% 
  st_geometry() %>% 
  plot(lwd = .1, col = pal[cut(dens, quantile(dens, seq(0, 1, le = 11)))], main = NA)

vn0 %>% 
  st_geometry() %>% 
  plot(add = TRUE)

The relationship between the population size and the population density:

vn2 %>% 
  mutate(dens = n / area_km2) %$%
  plot(log10(n), log10(dens), col = "blue", axes = FALSE, xlab = "population size",
       ylab = expression(paste("population density (/", km^2, ")")))

axis(1, 3:6, format(10^(3:6), big.mark = ",", scientific = FALSE, trim = TRUE))
axis(2, 1:4, format(10^(1:4), big.mark = ",", scientific = FALSE, trim = TRUE))

Colocation data

The list of colocation data files:

files <- dir(colocation_path)

There are 17 of them:

length(files)
## [1] 17

Making names from files names:

# Week label = what is left after stripping the "<prefix>__" and the ".csv"
# extension. BUG FIX: escape the dot and anchor the extension, otherwise the
# unescaped "." matches any character (e.g. it would also strip "xcsv").
weeks <- str_remove_all(files, "^.*__|\\.csv$")

Loading the colocation data into a list (one slot per week):

colocation <- paste0(colocation_path, dir(colocation_path)) %>%
  map(readr::read_csv) %>%
  setNames(weeks)

The colocation data looks like this:

head(colocation, 1)
## $`2020-03-03`
## # A tibble: 448,665 x 15
##    polygon1_id polygon1_name lon_1 lat_1 name_stack_1 fb_population_1
##          <dbl> <chr>         <dbl> <dbl> <chr>                  <dbl>
##  1      834298 Huyện Lý Sơn   109.  15.4 Quảng Ngãi …             156
##  2      834671 Huyện Đạ Tẻh   108.  11.6 Lâm Đồng Pr…             123
##  3      834269 Huyện Hương …  107.  16.4 Thừa Thiên …             850
##  4      834290 Huyện Hiệp Đ…  108.  15.5 Quảng Nam P…             206
##  5      834320 Thị Xã Bình …  107.  11.7 Bình Phước …             991
##  6      834672 Huyện Cát Ti…  107.  11.7 Lâm Đồng Pr…             109
##  7      834652 Huyện Tuy Ph…  109.  11.4 Bình Thuận …            1555
##  8      834798 Huyện Đan Ph…  106.  21.1 Hanoi City …            2640
##  9      834599 Huyện Đông H…  106.  20.5 Thái Bình P…            1819
## 10      834336 Huyện Vĩnh H…  106.  10.9 Long An Pro…             422
## # … with 448,655 more rows, and 9 more variables: polygon2_id <dbl>,
## #   polygon2_name <chr>, lon_2 <dbl>, lat_2 <dbl>, name_stack_2 <chr>,
## #   fb_population_2 <dbl>, link_value <dbl>, country <chr>, ds <date>

Getting rid of whatever is not linked to Vietnam

The problem

Here we show what the problem is (i.e. some of the data are outside Vietnam). The following function plots the colocation data:

plot_fb <- function(df, xlim, ylim, col) {
  # Plot the locations of a colocation data frame (first two columns assumed
  # to be longitude and latitude) on top of a grey world map.
  plot(xlim, ylim, type = "n", asp = 1,
       xlab = "longitude", ylab = "latitude")
  maps::map(fill = TRUE, col = "grey", add = TRUE)
  points(df[[1]], df[[2]], col = col, pch = ".")
  axis(1)
  axis(2)
  box(bty = "o")
}

Let’s consider one week:

june23 <- colocation$`2020-06-23`

The locations in this week are within these boundaries:

(xlim <- range(range(june23$lon_1), range(june23$lon_2)))
## [1] 101.5709 109.3583
(ylim <- range(range(june23$lat_1), range(june23$lat_2)))
## [1]  8.668113 23.237631

Let’s plot the polygon 1:

june23 %>%
  select(lon_1, lat_1) %>% 
  distinct() %>% 
  plot_fb(xlim, ylim, "blue")

And the polygon 2:

june23 %>%
  select(lon_2, lat_2) %>% 
  distinct() %>% 
  plot_fb(xlim, ylim, "red")

The solution

In the following function, df is a data frame with the same column names as a “colocation map” data frame. pl is an sf non-projected polygon. type is either 1 or 2.

pts_in_pol <- function(type, df, pl, project = FALSE) {
  # For each row of df, returns 1 if the point (lon_<type>, lat_<type>) is
  # inside the polygon pl and 0 otherwise.
  # type: 1 or 2, selecting which pair of coordinate columns of df to use.
  # Assumes pl is not projected:
  # 4326: non-projected lon/lat
  # 3857: pseudo-Mercator (e.g. Google Maps)
  df %<>% st_as_sf(coords = paste0(c("lon_", "lat_"), type), crs = 4326)
  if (project) {
    df %<>% st_transform(3857)
    pl %<>% st_transform(3857)
  }
  # st_intersects() returns a sparse list (one integer vector per point);
  # its element lengths are 0 (outside) or 1 (inside) — base lengths() is
  # simpler and faster than map_int(length):
  lengths(st_intersects(df, pl))
}

The function returns a vector of length equal to the number of rows of df with 1 if the point is inside the polygon pl and 0 otherwise. Let’s try it:

tmp <- pts_in_pol(1, june23, vn0)
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
## although coordinates are longitude/latitude, st_intersects assumes that they are planar

It takes 3.5’ and it’s about 3 times slower if we project the points and polygon. The arguments of the following function are the same as the previous one. It uses the previous one to delete the records from df that have start and end coordinates that are outside pl:

rcd_in_pol <- function(df, pl, project = FALSE, mc.cores = 2) {
  # Keeps only the records of df whose start (lon_1, lat_1) or end
  # (lon_2, lat_2) point is inside the polygon pl. The two point-in-polygon
  # tests are run in parallel (mc.cores is now a parameter, defaulting to the
  # previous hard-coded value of 2).
  # Note: the former require(magrittr) was removed — it is an anti-pattern
  # (require() silently returns FALSE on failure) and magrittr is already
  # loaded at the top of the document anyway.
  inside <- parallel::mclapply(1:2, pts_in_pol, df, pl, project,
                               mc.cores = mc.cores)
  # keep a record if at least one of its two endpoints is inside pl:
  keep <- rowSums(as.data.frame(inside)) > 0
  df[keep, ]
}

Let’s try it:

june23 %<>% rcd_in_pol(vn0)

Let’s process all the data (takes about 1 minute):

colocation %<>% map(rcd_in_pol, vn0)

Exploring the colocation data

Interesting to see, by the way, the effect of the lockdown:

nd <- nrow(vn2)
ld <- ymd(c(20200401, 20200423))
ys <- c(-1e5, 6e5)
xs <- ymd(names(colocation)) - 6
nb_links <- map_dbl(colocation, nrow)
xlim <- ymd(c("2020-02-29", "2020-07-01"))
plot(xs, nb_links, type = "s", xlab = NA, ylab = "number of links",
     xlim = xlim, ylim = c(0, nd^2), col = "blue", lwd = 2)
abline(h = nd^2, lty = 2, col = "grey")
polygon(c(ld, rev(ld)), rep(ys, each = 2), col = adjustcolor("red", .25), border = NA)

Let’s look at the sum of the probability per week:

link_val <- map(colocation, pull, link_value)
plot(xs, map_dbl(link_val, sum), type = "s", col = "blue", xlim = xlim,
     xlab = NA, ylab = "sums of probabilities", lwd = 2)
polygon(c(ld, rev(ld)), rep(ys, each = 2), col = adjustcolor("red", .25), border = NA)

The 10th week looks suspicious. There is no missing value and no zeros in the link variable:

link_val %<>% unlist()
any(is.na(link_val))
## [1] FALSE
range(link_val)
## [1] 2.716479e-12 2.276358e-01

Working out a district ID common to GADM + census and colocation data

The list of district in the colocation data:

col_names <- c("polygon_id", "polygon_name", "lon", "lat", "name_stack")

districts1 <- map_dfr(colocation, select, polygon1_id, polygon1_name, lon_1, lat_1, name_stack_1) %>% 
  setNames(col_names)

districts2 <- map_dfr(colocation, select, polygon2_id, polygon2_name, lon_2, lat_2, name_stack_2) %>% 
  setNames(col_names)

districts <- bind_rows(districts1, districts2) %>%
  distinct()

This is what it looks like:

districts
## # A tibble: 693 x 5
##    polygon_id polygon_name       lon   lat name_stack                           
##         <dbl> <chr>            <dbl> <dbl> <chr>                                
##  1     834298 Huyện Lý Sơn      109.  15.4 Quảng Ngãi Province // Huyện Lý Sơn  
##  2     834671 Huyện Đạ Tẻh      108.  11.6 Lâm Đồng Province // Huyện Đạ Tẻh    
##  3     834269 Huyện Hương Trà   107.  16.4 Thừa Thiên Huế Province // Huyện Hươ…
##  4     834290 Huyện Hiệp Đức    108.  15.5 Quảng Nam Province // Huyện Hiệp Đức 
##  5     834320 Thị Xã Bình Long  107.  11.7 Bình Phước Province // Thị Xã Bình L…
##  6     834672 Huyện Cát Tiên    107.  11.7 Lâm Đồng Province // Huyện Cát Tiên  
##  7     834652 Huyện Tuy Phong   109.  11.4 Bình Thuận Province // Huyện Tuy Pho…
##  8     834798 Huyện Đan Phượng  106.  21.1 Hanoi City // Huyện Đan Phượng       
##  9     834599 Huyện Đông Hưng   106.  20.5 Thái Bình Province // Huyện Đông Hưng
## 10     834336 Huyện Vĩnh Hưng   106.  10.9 Long An Province // Huyện Vĩnh Hưng  
## # … with 683 more rows

Province name missing for some districts of Hanoi

In the colocation data, the name_stack_* variables contain the names of the province and the district separated by //. The problem is that there are a number of districts that do not have // in their name_stack variable and all of them seem to be in the province of Hanoi:

plot(st_geometry(vn0), col = "grey")

vn1 %>%
  filter(NAME_1 == "Hà Nội") %>%
  st_geometry() %>%
  plot(add = TRUE, col = "yellow")

districts %>% 
  filter(! grepl(" // ", name_stack)) %>% 
  st_as_sf(coords = c("lon", "lat"), crs = 4326) %>% 
  st_geometry() %>% 
  plot(add = TRUE, col = "red")

Separating province name from district name in the colocation data

Plus a number of names fixes:

districts %<>% 
  separate(name_stack, c("province", "district"), " // ") %>% 
  mutate(indicate = is.na(district),
         district = ifelse(indicate, province, district),
         province = ifelse(indicate, "Hanoi City", province) %>% 
           str_squish() %>% 
           str_remove(" Province| City") %>% 
           str_replace("-", " - ") %>% 
           str_replace("Da Nang"     , "Đà Nẵng") %>%
           str_replace("Hanoi"       , "Hà Nội") %>% 
           str_replace("Hai Phong"   , "Hải Phòng") %>%
           str_replace("Ho Chi Minh" , "Hồ Chí Minh") %>%
           str_replace("Hòa Bình"    , "Hoà Bình"),
         polygon_name = str_squish(polygon_name) %>% 
           str_replace("Thành Phố Cao Lãnh", "Cao Lãnh (Thành phố)") %>% 
           str_replace("Thị Xã Hồng Ngự", "Hồng Ngự (Thị xã)") %>% 
           str_remove("Huyện |Thành phố |Thị xã |Quận |Thành Phố |Thị Xã ") %>%
           str_replace("Quy Nhơn"    , "Qui Nhơn") %>% 
           str_replace("Đảo Phú Quý" , "Phú Quí") %>% 
           str_replace("Bình Thủy"   , "Bình Thuỷ") %>% 
           str_replace("Hòa An"      , "Hoà An") %>% 
           str_replace("Phục Hòa"    , "Phục Hoà") %>% 
           str_replace("Thái Hòa"    , "Thái Hoà") %>% 
           str_replace("Hạ Hòa"      , "Hạ Hoà") %>% 
           str_replace("Phú Hòa"     , "Phú Hoà") %>% 
           str_replace("Tây Hòa"     , "Tây Hoà") %>% 
           str_replace("Tuy Hòa"     , "Tuy Hoà") %>% 
           str_replace("Krông Ana"   , "Krông A Na") %>% 
           str_replace("Krông A Na"  , "Krông A Na") %>% ##
           str_replace("Krông Păk"   , "Krông Pắc") %>% 
           str_replace("Krông Pắc"   , "Krông Pắc") %>% ##
           str_replace("Đắk Glong"   , "Đăk Glong") %>% 
           str_replace("Đắk Rlấp"    , "Đắk R'Lấp") %>% 
           str_replace("A Yun Pa"    , "Ayun Pa") %>% 
           str_replace("Từ Liêm"     , "Nam Từ Liêm") %>% 
           str_replace("Kiến Thụy"   , "Kiến Thuỵ") %>% 
           str_replace("Thủy Nguyên" , "Thuỷ Nguyên") %>% 
           str_replace("Vị Thủy"     , "Vị Thuỷ") %>% 
           str_replace("Bác Ai"      , "Bác Ái") %>% 
           str_replace("Thanh Thủy"  , "Thanh Thuỷ") %>% 
           str_replace("Yên Hưng"    , "Quảng Yên") %>% 
           str_replace("Na Hang"     , "Nà Hang") %>% 
           str_replace("Mù Cang Chải", "Mù Căng Chải") %>%
           str_replace("M`Đrắk"      , "M'Đrắk") %>% 
           str_replace("Cư M`Gar"    , "Cư M'gar") %>% 
           str_replace("Ea H`Leo"    , "Ea H'leo") %>% 
           str_replace("Nam Từ Liêm" , "Từ Liêm") %>% 
           str_replace("Buôn Hồ"     , "Buôn Hồ"),
         polygon_name = ifelse(province == "Bạc Liêu" & polygon_name == "Hòa Bình",
                               "Hoà Bình", polygon_name)) %>% 
  select(-indicate, -district) %>% 
  rename(district = polygon_name)
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 4 rows [82, 210,
## 309, 593].

The warnings are produced when the provinces names are missing for some of the districts of Hanoi. Separating the districts names per province for both sources:

anti_join(districts, vn2, c("province", "district"))
## # A tibble: 0 x 5
## # … with 5 variables: polygon_id <dbl>, district <chr>, lon <dbl>, lat <dbl>,
## #   province <chr>
anti_join(vn2, districts, c("province", "district"))
## Simple feature collection with 5 features and 4 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: 104.636 ymin: 11.60531 xmax: 107.7464 ymax: 21.04582
## CRS:            +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0
##     province     district     n    area_km2                       geometry
## 1 Bình Phước    Phú Riềng 97931  667.330016 MULTIPOLYGON (((106.8315 11...
## 2  Hải Phòng Bạch Long Vĩ   912   15.273999 MULTIPOLYGON (((107.7457 20...
## 3    Kon Tum   Ia H' Drai  6638 1159.609633 MULTIPOLYGON (((107.4588 13...
## 4  Quảng Trị       Cồn Cỏ   400    2.176718 MULTIPOLYGON (((107.343 17....
## 5     Sơn La       Vân Hồ 61197  969.335926 MULTIPOLYGON (((105.0134 20...

Let’s map these districts that never have any data:

vn0 %>% 
  st_geometry() %>% 
  plot(col = "grey")

tmp <- vn2 %>% 
  mutate(pd = paste(province, district)) %>% 
  filter(! pd %in% with(districts, paste(province, district)))

tmp %>% 
  st_geometry() %>% 
  plot(add = TRUE, col = "red")

tmp %>%
  st_centroid() %>% 
  st_coordinates() %>% 
  as_tibble() %>% 
  filter(between(Y, 15, 20.5)) %>%
  st_as_sf(coords = c("X", "Y")) %>% 
  plot(add = TRUE, col = "red")

Let’s add these 5 districts to districts:

tmp <- vn2 %>% 
  mutate(pd = paste(province, district)) %>% 
  filter(! pd %in% with(districts, paste(province, district)))

districts <- tmp %>%
  st_centroid() %>% 
  st_coordinates() %>% 
  as_tibble() %>% 
  transmute(lon = X, lat = Y) %>% 
  bind_cols(tmp) %>% 
  mutate(polygon_id = 850001:850005) %>% 
  select(polygon_id, district, lon, lat, province) %>% 
  bind_rows(districts)

This is what it looks like now:

districts
## # A tibble: 698 x 5
##    polygon_id district       lon   lat province      
##         <dbl> <chr>        <dbl> <dbl> <chr>         
##  1     850001 Phú Riềng     107.  11.7 Bình Phước    
##  2     850002 Bạch Long Vĩ  108.  20.1 Hải Phòng     
##  3     850003 Ia H' Drai    108.  14.2 Kon Tum       
##  4     850004 Cồn Cỏ        107.  17.2 Quảng Trị     
##  5     850005 Vân Hồ        105.  20.8 Sơn La        
##  6     834298 Lý Sơn        109.  15.4 Quảng Ngãi    
##  7     834671 Đạ Tẻh        108.  11.6 Lâm Đồng      
##  8     834269 Hương Trà     107.  16.4 Thừa Thiên Huế
##  9     834290 Hiệp Đức      108.  15.5 Quảng Nam     
## 10     834320 Bình Long     107.  11.7 Bình Phước    
## # … with 688 more rows

Let’s merge this information with the polygon / census data:

vn2 %<>% 
  left_join(districts, c("province", "district")) %>% 
  select(polygon_id, province, district, n, area_km2, lon, lat, geometry)

and this what it looks like now:

vn2
## Simple feature collection with 698 features and 7 fields
## geometry type:  GEOMETRY
## dimension:      XY
## bbox:           xmin: 102.145 ymin: 8.381355 xmax: 109.4692 ymax: 23.39269
## CRS:            +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0
## First 10 features:
##    polygon_id province   district      n area_km2      lon      lat
## 1      834464 An Giang     An Phú 183074 225.1253 105.1009 10.84574
## 2      834463 An Giang   Châu Đốc 117244 104.0911 105.0873 10.67470
## 3      834466 An Giang   Châu Phú 255894 450.9987 105.1797 10.55759
## 4      834468 An Giang Châu Thành 176945 354.9705 105.2659 10.42124
## 5      834469 An Giang    Chợ Mới 353705 369.1722 105.4614 10.47487
## 6      834462 An Giang Long Xuyên 303381 114.2415 105.4280 10.37121
## 7      834465 An Giang    Phú Tân 214823 311.6955 105.2758 10.65475
## 8      834873 An Giang   Tân Châu 176007 172.0770 105.1859 10.79910
## 9      834470 An Giang  Thoại Sơn 189389 468.9065 105.2556 10.29679
## 10     834467 An Giang  Tịnh Biên 124798 354.1079 105.0113 10.54809
##                          geometry
## 1  MULTIPOLYGON (((105.1216 10...
## 2  MULTIPOLYGON (((105.1387 10...
## 3  MULTIPOLYGON (((105.327 10....
## 4  MULTIPOLYGON (((105.3082 10...
## 5  MULTIPOLYGON (((105.4729 10...
## 6  MULTIPOLYGON (((105.4729 10...
## 7  MULTIPOLYGON (((105.327 10....
## 8  MULTIPOLYGON (((105.2463 10...
## 9  MULTIPOLYGON (((105.2532 10...
## 10 MULTIPOLYGON (((105.1139 10...

Exploring further the colocation data

colocation$`2020-03-03` %>%
  group_by(polygon1_id) %>%
  tally() %>% 
  right_join(select(districts, c("polygon1_id" = "polygon_id"))) %>% 
  mutate(n = replace_na(n, 0))
## Joining, by = "polygon1_id"
## # A tibble: 698 x 2
##    polygon1_id     n
##          <dbl> <dbl>
##  1      850001     0
##  2      850002     0
##  3      850003     0
##  4      850004     0
##  5      850005     0
##  6      834298   387
##  7      834671   458
##  8      834269   615
##  9      834290   426
## 10      834320   542
## # … with 688 more rows
nb_links <- function(x) {
  # Number of districts each district is connected to in one week of
  # colocation data, including the districts absent from that week (count 0).
  # NOTE(review): this function masks the `nb_links` vector computed earlier
  # in the document; consider renaming one of the two.
  x %>%
    group_by(polygon1_id) %>%
    tally() %>% 
    # explicit join key (was inferred before, producing one
    # "Joining, by = ..." message per week):
    right_join(select(districts, c("polygon1_id" = "polygon_id")),
               by = "polygon1_id") %>% 
    mutate(n = replace_na(n, 0)) %>% 
    pull(n)
}
cols <- RColorBrewer::brewer.pal(9, "Blues")

plot(xs, seq_along(xs), ylim = c(0, 700), type = "n", xlim = xlim,
     xlab = NA, ylab = "number of districts connected to")

tmp <- colocation %>% 
  map(nb_links) %>%
  map_dfc(quantile, 0:10 / 10) %>% 
  t() %>% 
  as.data.frame()
## Joining, by = "polygon1_id"
## Joining, by = "polygon1_id"
## Joining, by = "polygon1_id"
## Joining, by = "polygon1_id"
## Joining, by = "polygon1_id"
## Joining, by = "polygon1_id"
## Joining, by = "polygon1_id"
## Joining, by = "polygon1_id"
## Joining, by = "polygon1_id"
## Joining, by = "polygon1_id"
## Joining, by = "polygon1_id"
## Joining, by = "polygon1_id"
## Joining, by = "polygon1_id"
## Joining, by = "polygon1_id"
## Joining, by = "polygon1_id"
## Joining, by = "polygon1_id"
## Joining, by = "polygon1_id"
xs2 <- xs + 3

for (i in 1:5)
  polygon(c(xs2, rev(xs2)), c(tmp[[i]], rev(tmp[[12 - i]])), col = cols[i], border = NA)

lines(xs2, tmp[[6]], col = cols[6])
abline(h = nrow(districts), col = "grey", lty = 2)
polygon(c(ld, rev(ld)), rep(ys, each = 2), col = adjustcolor("red", .1), border = NA)

Rearranging the colocation data into a matrix

Let’s generate a template with all the combinations of districts:

template <- districts %>% 
  arrange(polygon_id) %$%
  expand.grid(polygon_id, polygon_id) %>% 
  as_tibble() %>% 
  setNames(c("polygon1_id", "polygon2_id"))

The function that transforms the colocation data into a matrix:

to_matrix <- function(df, template) {
  # Turns one week of colocation data into a square link-value matrix whose
  # rows and columns are the (sorted) polygon IDs of `template`. Missing
  # district pairs get a link value of 0.
  dim_names <- sort(unique(template$polygon1_id))
  df %>% 
    select(polygon1_id, polygon2_id, link_value) %>% 
    left_join(template, ., c("polygon1_id", "polygon2_id")) %>%
    mutate(link_value = replace_na(link_value, 0)) %>% 
    pull(link_value) %>% 
    # the matrix dimension is derived from template (was nrow(districts),
    # a hidden dependency on a global variable):
    matrix(length(dim_names)) %>%
    `colnames<-`(dim_names) %>% 
    `rownames<-`(dim_names)
}

Let’s do it for all the weeks and sum them:

coloc_mat <- colocation %>% 
  map(to_matrix, template) %>% 
  reduce(`+`)

Let’s have a look at this matrix. Let’s first order the district from south to north and from west to east:

hash <- setNames(seq_along(colnames(coloc_mat)),
                           colnames(coloc_mat))
ind <- districts %>% 
  arrange(lat, lon) %>% 
  pull(polygon_id) %>% 
  as.character() %>% 
  magrittr::extract(hash, .)

coloc_mat <- coloc_mat[ind, ind]

Let’s now plot the matrix:

opar <- par(pty = "s")
image(log10(apply(t(coloc_mat), 2, rev)), axes = FALSE)
par(opar)
box(bty = "o")

We can verify that the matrix is symmetric and that both Hanoi and Saigon are well connected to every part of the country. We can also see the districts that are not connected at all.

Subsetting according to coordinates

Let’s say we want to select all the provinces from the Northern EPI, the southernmost province of which is Nghệ An. Let’s retrieve the latitude of the centroids of all the provinces:

tmp <- vn1 %>% 
  st_centroid() %>% 
  st_coordinates() %>% 
  as_tibble() %>% 
  pull(Y) %>% 
  mutate(vn1, lat_cent = .) %>% 
  select(NAME_1, lat_cent)
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over
## geometries of x
## Warning in st_centroid.sfc(st_geometry(x), of_largest_polygon =
## of_largest_polygon): st_centroid does not give correct centroids for longitude/
## latitude data

The province south of Nghệ An is Hà Tĩnh, the latitude of centroid of which is:

threshold <- tmp %>% 
  filter(NAME_1 == "Hà Tĩnh") %>% 
  pull(lat_cent)

Now, we retrieve all the names of all the provinces that are north of this threshold:

northernEPI <- tmp %>% 
  filter(lat_cent > threshold) %>% 
  pull(NAME_1)

Now, we retrieve the corresponding districts’ ID:

sel <- districts %>% 
  filter(province %in% northernEPI) %>% 
  pull(polygon_id)

And now we can subset our matrix:

sel <- colnames(coloc_mat) %in% sel
coloc_mat_northernEPI <- coloc_mat[sel, sel]

Which gives:

opar <- par(pty = "s")
image(log10(apply(t(coloc_mat_northernEPI), 2, rev)), axes = FALSE)
par(opar)
box(bty = "o")

Exploring the colocation data

Coverage: comparing facebook population with census population

colocation$`2020-03-03` %>%
  select(polygon1_id, fb_population_1) %>% 
  distinct() %>% 
  left_join(select(districts, -lon, -lat), c("polygon1_id" = "polygon_id")) %>% 
  # BUG FIX: the column of vn2 is named "area_km2", not "area" (selecting
  # the non-existent "area" column errors out):
  left_join(select(st_drop_geometry(vn2), -area_km2), c("province", "district")) %$%
  plot(n, log10(fb_population_1))

Gravity model

One model fairly used in geography is the so-called gravity model that says that the connection between any 2 locations \(i\) and \(j\) is:

\[ C_{ij} = \theta\frac{P_i^{\tau_1}P_j^{\tau_2}}{d_{ij}^\rho} \] See for example 10.1126/science.1125237.

TO DO: test this model on the facebook data.

Effective distance

See 10.1126/science.1245200